Data Munging Practice

Practicing data wrangling and cleaning using various Python libraries to improve my data science practices and create some reusable code I can use for future projects.



In [1]:

    
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Automating Data Extraction



In [22]:

    
import os
import urllib

root_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/"
full_url = root_url + "cylinder-bands/bands.data"
dataset_location = os.path.join("datasets","cylinder-bands")
filename = 'cylinder-bands.csv'

def fetch_data(filename, full_url=full_url, dataset_location=dataset_location):
    if not os.path.isdir(dataset_location):
        os.makedirs(dataset_location)
    file_path = os.path.join(dataset_location, filename)
    urllib.request.urlretrieve(full_url, file_path)



In [23]:

    
fetch_data("cylinder-bands.csv")



In [24]:

    
def load_data(filename, dataset_location=dataset_location):
    csv_path = os.path.join(dataset_location, filename)
    return pd.read_csv(csv_path, header=None)



In [25]:

    
cylinder_bands = load_data("cylinder-bands.csv")
cylinder_bands.head()









    Out[25]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
    
  
  
    
      0
      19910108
      X126
      TVGUIDE
      25503
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      36.4
      0
      0
      2.5
      1
      34
      40
      105
      100
      band
    
    
      1
      19910109
      X266
      TVGUIDE
      25503
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      38.5
      0
      0
      2.5
      0.7
      34
      40
      105
      100
      noband
    
    
      2
      19910104
      B7
      MODMAT
      47201
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      39.8
      0
      0
      2.8
      0.9
      40
      40
      103.87
      100
      noband
    
    
      3
      19910104
      T133
      MASSEY
      39039
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      38.8
      0
      0
      2.5
      1.3
      40
      40
      108.06
      100
      noband
    
    
      4
      19910111
      J34
      KMART
      37351
      NO
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      42.5
      5
      0
      2.3
      0.6
      35
      40
      106.67
      100
      noband
    
  

5 rows × 40 columns



In [26]:

    
attributes = [
    'timestamp',
    'cylinder number',
    'customer',
    'job number',
    'grain screened',
    'ink color',
    'proof on ctd ink',
    'blade mfg',
    'cylinder divisional',
    'paper type',
    'ink type',
    'direct stream',
    'solvent type',
    'type on cylinder',
    'press type',
    'press',
    'unit number',
    'cylinder size',
    'paper mill location',
    'plating tank',
    'proof cut',
    'viscosity',
    'caliper',
    'ink temperature',
    'humidity',
    'roughness',
    'blade pressure',
    'varnish pct',
    'press speed',
    'ink pct',
    'solvent pct',
    'ESA Voltage',
    'ESA Amperage',
    'wax',
    'hardener',
    'roller durometer',
    'current density',
    'anode space ratio',
    'chrome content',
    'band type',
]



In [27]:

    
len(attributes)









    Out[27]:





40



In [28]:

    
cylinder_bands.columns = attributes



In [29]:

    
cylinder_bands.head()









    Out[29]:







  
    
      
      timestamp
      cylinder number
      customer
      job number
      grain screened
      ink color
      proof on ctd ink
      blade mfg
      cylinder divisional
      paper type
      ...
      solvent pct
      ESA Voltage
      ESA Amperage
      wax
      hardener
      roller durometer
      current density
      anode space ratio
      chrome content
      band type
    
  
  
    
      0
      19910108
      X126
      TVGUIDE
      25503
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      36.4
      0
      0
      2.5
      1
      34
      40
      105
      100
      band
    
    
      1
      19910109
      X266
      TVGUIDE
      25503
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      38.5
      0
      0
      2.5
      0.7
      34
      40
      105
      100
      noband
    
    
      2
      19910104
      B7
      MODMAT
      47201
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      39.8
      0
      0
      2.8
      0.9
      40
      40
      103.87
      100
      noband
    
    
      3
      19910104
      T133
      MASSEY
      39039
      YES
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      38.8
      0
      0
      2.5
      1.3
      40
      40
      108.06
      100
      noband
    
    
      4
      19910111
      J34
      KMART
      37351
      NO
      KEY
      YES
      BENTON
      GALLATIN
      UNCOATED
      ...
      42.5
      5
      0
      2.3
      0.6
      35
      40
      106.67
      100
      noband
    
  

5 rows × 40 columns

Reformatting Data Types



In [30]:

    
cylinder_bands.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 40 columns):
timestamp              541 non-null object
cylinder number        541 non-null object
customer               541 non-null object
job number             541 non-null object
grain screened         541 non-null object
ink color              541 non-null object
proof on ctd ink       541 non-null object
blade mfg              541 non-null object
cylinder divisional    541 non-null object
paper type             541 non-null object
ink type               541 non-null object
direct stream          541 non-null object
solvent type           541 non-null object
type on cylinder       541 non-null object
press type             541 non-null object
press                  541 non-null object
unit number            541 non-null float64
cylinder size          540 non-null object
paper mill location    540 non-null object
plating tank           540 non-null object
proof cut              540 non-null object
viscosity              540 non-null object
caliper                540 non-null object
ink temperature        539 non-null object
humidity               539 non-null object
roughness              539 non-null object
blade pressure         539 non-null object
varnish pct            539 non-null object
press speed            539 non-null object
ink pct                539 non-null object
solvent pct            539 non-null object
ESA Voltage            539 non-null object
ESA Amperage           539 non-null object
wax                    539 non-null object
hardener               539 non-null object
roller durometer       539 non-null object
current density        539 non-null object
anode space ratio      539 non-null object
chrome content         539 non-null object
band type              539 non-null object
dtypes: float64(1), object(39)
memory usage: 169.1+ KB



In [31]:

    
dt_attributes = [
    'timestamp',
]

cat_attributes = [
    'cylinder number',
    'customer',
    'job number',
    'grain screened',
    'ink color',
    'proof on ctd ink',
    'blade mfg',
    'cylinder divisional',
    'paper type',
    'ink type',
    'direct stream',
    'solvent type',
    'type on cylinder',
    'press type',
    'press',
    'unit number',
    'cylinder size',
    'paper mill location',
    'plating tank',
]

num_attributes = [
    'proof cut',
    'viscosity',
    'caliper',
    'ink temperature',
    'humidity',
    'roughness',
    'blade pressure',
    'varnish pct',
    'press speed',
    'ink pct',
    'solvent pct',
    'ESA Voltage',
    'ESA Amperage',
    'wax',
    'hardener',
    'roller durometer',
    'current density',
    'anode space ratio',
    'chrome content',
]



In [32]:

    
def contextual_convert(df, dt_attributes=dt_attributes, num_attributes=num_attributes, cat_attributes=cat_attributes):
    attributes = dt_attributes + num_attributes + cat_attributes
    for att in attributes:
        if att in dt_attributes:
            df[att] = pd.to_datetime(df[att], format="%Y%m%d", errors='coerce')
        elif att in num_attributes:
            df[att] = pd.to_numeric(df[att], errors='coerce')
        elif att in cat_attributes:
            df[att] = df[att].astype('category')
        else:
            pass



In [33]:

    
contextual_convert(cylinder_bands)



In [34]:

    
cylinder_bands[dt_attributes].info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 1 columns):
timestamp    540 non-null datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 4.3 KB



In [35]:

    
cylinder_bands[cat_attributes].info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 19 columns):
cylinder number        541 non-null category
customer               541 non-null category
job number             541 non-null category
grain screened         541 non-null category
ink color              541 non-null category
proof on ctd ink       541 non-null category
blade mfg              541 non-null category
cylinder divisional    541 non-null category
paper type             541 non-null category
ink type               541 non-null category
direct stream          541 non-null category
solvent type           541 non-null category
type on cylinder       541 non-null category
press type             541 non-null category
press                  541 non-null category
unit number            541 non-null category
cylinder size          540 non-null category
paper mill location    540 non-null category
plating tank           540 non-null category
dtypes: category(19)
memory usage: 53.7 KB



In [36]:

    
cylinder_bands[num_attributes].info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 19 columns):
proof cut            486 non-null float64
viscosity            535 non-null float64
caliper              512 non-null float64
ink temperature      537 non-null float64
humidity             538 non-null float64
roughness            509 non-null float64
blade pressure       476 non-null float64
varnish pct          484 non-null float64
press speed          529 non-null float64
ink pct              484 non-null float64
solvent pct          484 non-null float64
ESA Voltage          483 non-null float64
ESA Amperage         485 non-null float64
wax                  533 non-null float64
hardener             532 non-null float64
roller durometer     485 non-null float64
current density      532 non-null float64
anode space ratio    532 non-null float64
chrome content       536 non-null float64
dtypes: float64(19)
memory usage: 80.4 KB



In [37]:

    
cylinder_bands['customer'].str.upper()









    Out[37]:





0            TVGUIDE
1            TVGUIDE
2             MODMAT
3             MASSEY
4              KMART
5             MASSEY
6              ROSES
7              ROSES
8             MODMAT
9         CHILDCRAFT
10        CHILDCRAFT
11       HANOVRHOUSE
12        HANOVRHOUS
13        GUIDEPOSTS
14        HANOVRHOUS
15            MODMAT
16          HOMESHOP
17          HOMESHOP
18             USCAV
19             USCAV
20          COLORTIL
21          COLORTIL
22             WARDS
23            TARGET
24             KMART
25             KMART
26             KMART
27             KMART
28         WOOLWORTH
29              AMES
           ...      
511          VENTURE
512         HOMESHOP
513         HOMESHOP
514           MODMAT
515           TARGET
516              JFK
517    COLORFULIMAGE
518         BESTPROD
519            KMART
520          TOYSRUS
521          ECKERDS
522             1910
523           MODMAT
524            KMART
525            KMART
526           MODMAT
527          TOYSRUS
528         HOMESHOP
529          TOYSRUS
530              JCP
531          WALMART
532            KMART
533           ECKERD
534           ECKERD
535            KMART
536            KMART
537             BEST
538            KMART
539            KMART
540             BEST
Name: customer, Length: 541, dtype: object



In [38]:

    
cylinder_bands['customer'].value_counts()









    Out[38]:





MODMAT            61
KMART             54
TARGET            40
TVGUIDE           38
WARDS             33
AMES              24
TOYSRUS           23
ROSES             22
kmart             13
BESTPROD           9
SHEPLERS           9
ECKERD             9
WOOLWORTH          9
CHILDCRAFT         9
WALMART            9
AUSTADS            8
HILLS              8
MASSEY             7
ECKERDS            7
DOWNS              7
GUIDEPOSTS         6
USCAV              6
REI                5
COLORTILE          5
HANOVERHOUSE       4
BELK               4
homeshop           4
DUNNS              4
SEARS              4
ABBEY              4
                  ..
HOMESHOP           2
HOMESHOPPING       2
WOOLWRTH           2
CAMPINGWORLD       2
JCPENNY            2
BURDINES           2
SERVMERCH          2
LAZARUS            2
OLDVILLAGESHOP     2
BRENDLS            2
COLORTIL           2
ADCO               1
walmart            1
GALLS              1
colorfulimage      1
target             1
jfk                1
jcp                1
homeshopping       1
hanoverhouse       1
eckerds            1
YIELDHOUSE         1
GLOBAL             1
PENNEY             1
NTLWILDLIFE        1
KIDSRUS            1
venture            1
HANOVRHOUSE        1
GLOBALEQUP         1
1910               1
Name: customer, Length: 84, dtype: int64



In [53]:

    
def capitalize_columns(df, cols):
    for col in cols:
        df[col] = df[col].str.upper()



In [39]:

    
cylinder_bands['customer'] = cylinder_bands['customer'].str.upper()



In [54]:

    
capitalize_columns(cylinder_bands, ['customer', 'cylinder number'])

Preprocessing Pipelines



In [58]:

    
from category_encoders import OrdinalEncoder, OneHotEncoder



In [ ]:

    
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

Scratch Work



In [65]:

    
cylinder_bands['cylinder size'].value_counts()









    Out[65]:





TABLOID    281
CATALOG    164
SPIEGEL     54
tabloid     17
catalog     14
spiegel      6
?            3
0.7          1
Name: cylinder size, dtype: int64



In [74]:

    
cylinder_bands['plating tank'].value_counts()









    Out[74]:





1910    331
1911    190
?        18
40        1
Name: plating tank, dtype: int64



In [67]:

    
cylinder_bands[cylinder_bands['cylinder size'] == "0.7"]









    Out[67]:







  
    
      
      timestamp
      cylinder number
      customer
      job number
      grain screened
      ink color
      proof on ctd ink
      blade mfg
      cylinder divisional
      paper type
      ...
      solvent pct
      ESA Voltage
      ESA Amperage
      wax
      hardener
      roller durometer
      current density
      anode space ratio
      chrome content
      band type
    
  
  
    
      522
      NaT
      ?
      1910
      ?
      45
      0.200
      17
      84
      0.8125
      27
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

1 rows × 40 columns



In [68]:

    
raw_cylinder_bands = load_data(filename)



In [70]:

    
raw_cylinder_bands.iloc[522]









    Out[70]:





0     tabloid
1           ?
2        1910
3           ?
4          45
5       0.200
6          17
7          84
8      0.8125
9          27
10          ?
11       1865
12          ?
13          ?
14          ?
15          ?
16        1.5
17        0.7
18          ?
19         40
20      109.1
21         95
22       band
23        NaN
24        NaN
25        NaN
26        NaN
27        NaN
28        NaN
29        NaN
30        NaN
31        NaN
32        NaN
33        NaN
34        NaN
35        NaN
36        NaN
37        NaN
38        NaN
39        NaN
Name: 522, dtype: object

	0	1	2	3	4	5	6	7	8	9	...	30	31	33	34	35	36	37	38	39
0	19910108	X126	TVGUIDE	25503	YES	KEY	YES	BENTON	GALLATIN	UNCOATED	...	36.4	0	2.5	1	34	40	105	100	band
1	19910109	X266	TVGUIDE	25503	YES	KEY	YES	BENTON	GALLATIN	UNCOATED	...	38.5	0	2.5	0.7	34	40	105	100	noband
2	19910104	B7	MODMAT	47201	YES	KEY	YES	BENTON	GALLATIN	UNCOATED	...	39.8	0	2.8	0.9	40	40	103.87	100	noband
3	19910104	T133	MASSEY	39039	YES	KEY	YES	BENTON	GALLATIN	UNCOATED	...	38.8	0	2.5	1.3	40	40	108.06	100	noband
4	19910111	J34	KMART	37351	NO	KEY	YES	BENTON	GALLATIN	UNCOATED	...	42.5	5	2.3	0.6	35	40	106.67	100	noband